import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Fail fast on a hung connection or an HTTP error instead of silently
# parsing an error page as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
req = response.text
soup = BeautifulSoup(req, 'html')
# The contents of the table cell for reference
sample_cell = soup.find_all('td')[6]
print(sample_cell.text)
sample_cell
M7AQueen's Park(Ontario Provincial Government)
<td style="width:11%; vertical-align:top;"> <p><b>M7A</b><br/><span style="font-size:85%;"><a href="/wiki/Queen%27s_Park_(Toronto)" title="Queen's Park (Toronto)">Queen's Park</a><br/>(Ontario Provincial Government)</span> </p> </td>
# First let's make a list that will contain all the information
comm = []


def _parse_cell(cell):
    """Turn one <td> cell into a PostalCode/Borough/Neighborhood dict.

    Returns None when the cell has no <b> or <span> child, or when its span
    text is 'Not assigned', so that such cells make no entry.
    """
    # The postal code is wrapped up in the first Bold tag <b></b>
    if cell.b is None or cell.span is None:
        return None
    text = cell.span.text
    # If the cell is unassigned it will not allocate any information
    if text == 'Not assigned':
        return None
    # The Borough is the bit of the text before the first '('
    borough, _, _ = text.partition('(')
    # Each neighborhood group sits between '(' and ')'; inside a group the
    # names are separated by ' / '
    groups = []
    for area in text.split(')'):
        _, paren, inside = area.partition('(')
        if paren:
            groups.append(inside.replace(' / ', ','))
    return {
        'PostalCode': cell.b.text,
        'Borough': borough,
        # With no parenthesised list at all, fall back to the whole cell text
        'Neighborhood': ','.join(groups) if groups else text,
    }


# Run through all the instances of a table cell and keep the ones with data
for instance in soup.find_all('td'):
    commDict = _parse_cell(instance)
    if commDict is not None:
        comm.append(commDict)

# Now we read the list of dictionaries into a dataframe
df_scrape = pd.DataFrame(comm)
df_scrape
| PostalCode | Borough | Neighborhood | |
|---|---|---|---|
| 0 | M3A | North York | Parkwoods |
| 1 | M4A | North York | Victoria Village |
| 2 | M5A | Downtown Toronto | Regent Park,Harbourfront |
| 3 | M6A | North York | Lawrence Manor,Lawrence Heights |
| 4 | M7A | Queen's Park | Ontario Provincial Government |
| ... | ... | ... | ... |
| 98 | M8X | Etobicoke | The Kingsway,Montgomery Road,Old Mill North |
| 99 | M4Y | Downtown Toronto | Church and Wellesley |
| 100 | M7Y | East TorontoBusiness reply mail Processing Cen... | Enclave of M4L |
| 101 | M8Y | Etobicoke | Old Mill South,King's Mill Park,Sunnylea,Humbe... |
| 102 | M8Z | Etobicoke | Mimico NW,The Queensway West,South of Bloor,Ki... |
103 rows × 3 columns
# Postal codes that are only postal addresses rather than real areas
ban_list = ['M7Y', 'M5W', 'M7R', 'M7A']
# Keep every row whose postal code is not on the ban list
is_banned = df_scrape['PostalCode'].isin(ban_list)
df_scrape = df_scrape[~is_banned]
df_scrape.shape
(99, 3)
import geocoder # import geocoder
# Load the coordinate table and index it by postal code in one step
df_PScodes = pd.read_csv(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv",
    index_col='Postal Code',
)
df_PScodes
| Latitude | Longitude | |
|---|---|---|
| Postal Code | ||
| M1B | 43.806686 | -79.194353 |
| M1C | 43.784535 | -79.160497 |
| M1E | 43.763573 | -79.188711 |
| M1G | 43.770992 | -79.216917 |
| M1H | 43.773136 | -79.239476 |
| ... | ... | ... |
| M9N | 43.706876 | -79.518188 |
| M9P | 43.696319 | -79.532242 |
| M9R | 43.688905 | -79.554724 |
| M9V | 43.739416 | -79.588437 |
| M9W | 43.706748 | -79.594054 |
103 rows × 2 columns
If both dataframes are indexed by postal code, pandas aligns the assignment on that index, so the new columns automatically take on the correct values for each postal code.
# Index by postal code so the coordinate columns line up on assignment
df_scrape = df_scrape.set_index('PostalCode')
for coord in ('Latitude', 'Longitude'):
    df_scrape[coord] = df_PScodes[coord]
df_scrape
| Borough | Neighborhood | Latitude | Longitude | |
|---|---|---|---|---|
| PostalCode | ||||
| M3A | North York | Parkwoods | 43.753259 | -79.329656 |
| M4A | North York | Victoria Village | 43.725882 | -79.315572 |
| M5A | Downtown Toronto | Regent Park,Harbourfront | 43.654260 | -79.360636 |
| M6A | North York | Lawrence Manor,Lawrence Heights | 43.718518 | -79.464763 |
| M9A | Etobicoke | Islington Avenue | 43.667856 | -79.532242 |
| ... | ... | ... | ... | ... |
| M5X | Downtown Toronto | First Canadian Place,Underground city | 43.648429 | -79.382280 |
| M8X | Etobicoke | The Kingsway,Montgomery Road,Old Mill North | 43.653654 | -79.506944 |
| M4Y | Downtown Toronto | Church and Wellesley | 43.665860 | -79.383160 |
| M8Y | Etobicoke | Old Mill South,King's Mill Park,Sunnylea,Humbe... | 43.636258 | -79.498509 |
| M8Z | Etobicoke | Mimico NW,The Queensway West,South of Bloor,Ki... | 43.628841 | -79.520999 |
99 rows × 4 columns
# Turn the PostalCode index back into an ordinary column
df_scrape = df_scrape.reset_index()
df_scrape
| PostalCode | Borough | Neighborhood | Latitude | Longitude | |
|---|---|---|---|---|---|
| 0 | M3A | North York | Parkwoods | 43.753259 | -79.329656 |
| 1 | M4A | North York | Victoria Village | 43.725882 | -79.315572 |
| 2 | M5A | Downtown Toronto | Regent Park,Harbourfront | 43.654260 | -79.360636 |
| 3 | M6A | North York | Lawrence Manor,Lawrence Heights | 43.718518 | -79.464763 |
| 4 | M9A | Etobicoke | Islington Avenue | 43.667856 | -79.532242 |
| ... | ... | ... | ... | ... | ... |
| 94 | M5X | Downtown Toronto | First Canadian Place,Underground city | 43.648429 | -79.382280 |
| 95 | M8X | Etobicoke | The Kingsway,Montgomery Road,Old Mill North | 43.653654 | -79.506944 |
| 96 | M4Y | Downtown Toronto | Church and Wellesley | 43.665860 | -79.383160 |
| 97 | M8Y | Etobicoke | Old Mill South,King's Mill Park,Sunnylea,Humbe... | 43.636258 | -79.498509 |
| 98 | M8Z | Etobicoke | Mimico NW,The Queensway West,South of Bloor,Ki... | 43.628841 | -79.520999 |
99 rows × 5 columns
First we will get a general feel for Toronto by plotting all the areas on a Toronto map
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
address = 'Toronto, Canada'
# Centre of the map: the mean of all postal-code coordinates
latitude = df_scrape.loc[:, 'Latitude'].mean()
longitude = df_scrape.loc[:, 'Longitude'].mean()
print(f'The geograpical coordinate of Toronto are {latitude}, {longitude}.')
The geograpical coordinate of Toronto are 43.70672879191919, -79.39601043535355.
# create map of Toronto centred on the mean latitude and longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11.4)

# add one circle marker per postal-code area, labelled "neighborhood, borough"
marker_rows = zip(
    df_scrape['Latitude'],
    df_scrape['Longitude'],
    df_scrape['Borough'],
    df_scrape['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto
The following code takes the coordinates of each area and calls Foursquare to find which venues lie within a 1300 m radius.
def getNearbyVenues(names, latitudes, longitudes, radius=1300, timeout=30):
    """Query Foursquare's venues/explore endpoint around each coordinate.

    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT,
    which must be defined before this function is called.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Consumed in lockstep; one API request is made per triple.
    radius : int
        Value sent as the API's ``radius`` query parameter.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; surface HTTP errors instead of a later KeyError
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None rather than crashing
                categories[0]['name'] if categories else None))

    # Passing the column names here keeps the schema even when rows is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return nearby_venues
# The postal code is used as the "name" of each area, so the resulting
# 'Neighborhood' column holds postal codes.
toronto_venues = getNearbyVenues(
    df_scrape['PostalCode'],
    df_scrape['Latitude'],
    df_scrape['Longitude'],
)
toronto_venues.shape
(2739, 7)
# Preview the first five venue rows
toronto_venues.head(n=5)
| Neighborhood | Neighborhood Latitude | Neighborhood Longitude | Venue | Venue Latitude | Venue Longitude | Venue Category | |
|---|---|---|---|---|---|---|---|
| 0 | M3A | 43.753259 | -79.329656 | Allwyn's Bakery | 43.759840 | -79.324719 | Caribbean Restaurant |
| 1 | M3A | 43.753259 | -79.329656 | Brookbanks Park | 43.751976 | -79.332140 | Park |
| 2 | M3A | 43.753259 | -79.329656 | Tim Hortons | 43.760668 | -79.326368 | Café |
| 3 | M3A | 43.753259 | -79.329656 | Donalda Golf & Country Club | 43.752816 | -79.342741 | Golf Course |
| 4 | M3A | 43.753259 | -79.329656 | Bruno's valu-mart | 43.746143 | -79.324630 | Grocery Store |
# Number of distinct venue names across all areas
len(pd.unique(toronto_venues['Venue']))
1794
# create a second map with the same centre, this time showing venues
map_toronto_venues = folium.Map(location=[latitude, longitude], zoom_start=11.4)

# add one circle marker per venue, labelled "postal code, venue name"
venue_rows = zip(
    toronto_venues['Venue Latitude'],
    toronto_venues['Venue Longitude'],
    toronto_venues['Venue'],
    toronto_venues['Neighborhood'],
)
for lat, lng, venue, neighborhood in venue_rows:
    label = folium.Popup(f'{neighborhood}, {venue}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto_venues)

map_toronto_venues

# number of venues returned for each postal code
venues_by_area = toronto_venues.groupby('Neighborhood')
venues_by_area.count()
| Neighborhood Latitude | Neighborhood Longitude | Venue | Venue Latitude | Venue Longitude | Venue Category | |
|---|---|---|---|---|---|---|
| Neighborhood | ||||||
| M1B | 29 | 29 | 29 | 29 | 29 | 29 |
| M1C | 8 | 8 | 8 | 8 | 8 | 8 |
| M1E | 28 | 28 | 28 | 28 | 28 | 28 |
| M1G | 20 | 20 | 20 | 20 | 20 | 20 |
| M1H | 30 | 30 | 30 | 30 | 30 | 30 |
| ... | ... | ... | ... | ... | ... | ... |
| M9N | 30 | 30 | 30 | 30 | 30 | 30 |
| M9P | 30 | 30 | 30 | 30 | 30 | 30 |
| M9R | 30 | 30 | 30 | 30 | 30 | 30 |
| M9V | 20 | 20 | 20 | 20 | 20 | 20 |
| M9W | 7 | 7 | 7 | 7 | 7 | 7 |
99 rows × 6 columns
# Report how many distinct venue categories were returned
n_categories = len(toronto_venues['Venue Category'].unique())
print(f'There are {n_categories} uniques categories.')
There are 275 uniques categories.
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category is itself called 'Neighborhood', adding the key column
# under that name would overwrite the category in place (and the key would not
# end up as the last column), so rename the category first.
if 'Neighborhood' in toronto_onehot.columns:
    toronto_onehot = toronto_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood column back to the dataframe, directly as the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# final column order: 'Neighborhood' first, then one column per venue category
fixed_columns = list(toronto_onehot.columns)
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()
| Zoo Exhibit | Afghan Restaurant | African Restaurant | Airport | Airport Lounge | American Restaurant | Amphitheater | Antique Shop | Aquarium | Arcade | ... | Transportation Service | Turkish Restaurant | Vegetarian / Vegan Restaurant | Video Game Store | Vietnamese Restaurant | Warehouse Store | Wine Bar | Wings Joint | Women's Store | Yoga Studio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 275 columns
# Mean of each one-hot column per postal code, i.e. the share of that area's
# venues falling in each category; the grouping key stays an ordinary column.
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped
| Neighborhood | Zoo Exhibit | Afghan Restaurant | African Restaurant | Airport | Airport Lounge | American Restaurant | Amphitheater | Antique Shop | Aquarium | ... | Transportation Service | Turkish Restaurant | Vegetarian / Vegan Restaurant | Video Game Store | Vietnamese Restaurant | Warehouse Store | Wine Bar | Wings Joint | Women's Store | Yoga Studio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M1B | 0.068966 | 0.0 | 0.034483 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.034483 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.034483 | 0.000000 |
| 1 | M1C | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 2 | M1E | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 3 | M1G | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.05 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 4 | M1H | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.033333 | 0.000000 | 0.033333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 94 | M9N | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 95 | M9P | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 96 | M9R | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.033333 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 97 | M9V | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 98 | M9W | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
99 rows × 275 columns
# (rows, columns) of the per-postal-code frequency table
toronto_grouped.shape
(99, 275)
num_top_venues = 5

# Print the most frequent venue categories for every postal code
for hood in toronto_grouped['Neighborhood']:
    print("----" + hood + "----")
    # Transpose this area's single row into one (venue, freq) row per column
    hood_row = toronto_grouped[toronto_grouped['Neighborhood'] == hood]
    temp = hood_row.T.reset_index()
    temp.columns = ['venue', 'freq']
    # The first transposed row is the 'Neighborhood' key itself, so skip it
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')
----M1B----
venue freq
0 Zoo Exhibit 0.07
1 Fast Food Restaurant 0.07
2 Restaurant 0.07
3 Trail 0.07
4 Park 0.03
----M1C----
venue freq
0 Breakfast Spot 0.12
1 Hotel 0.12
2 Playground 0.12
3 Burger Joint 0.12
4 Gym / Fitness Center 0.12
----M1E----
venue freq
0 Pizza Place 0.11
1 Fast Food Restaurant 0.07
2 Park 0.07
3 Bank 0.07
4 Automotive Shop 0.04
----M1G----
venue freq
0 Pizza Place 0.15
1 Coffee Shop 0.10
2 Fast Food Restaurant 0.10
3 Park 0.10
4 Music Store 0.05
----M1H----
venue freq
0 Indian Restaurant 0.10
1 Bank 0.07
2 Sandwich Place 0.07
3 Restaurant 0.07
4 Gas Station 0.07
----M1J----
venue freq
0 Sandwich Place 0.16
1 Pharmacy 0.12
2 Bank 0.08
3 Ice Cream Shop 0.08
4 Fish & Chips Shop 0.04
----M1K----
venue freq
0 Chinese Restaurant 0.13
1 Coffee Shop 0.13
2 Fast Food Restaurant 0.10
3 Discount Store 0.10
4 Pizza Place 0.07
----M1L----
venue freq
0 Coffee Shop 0.10
1 Intersection 0.07
2 Bus Line 0.07
3 Convenience Store 0.07
4 Bakery 0.07
----M1M----
venue freq
0 Harbor / Marina 0.13
1 Pizza Place 0.10
2 Park 0.10
3 Sandwich Place 0.07
4 Pharmacy 0.07
----M1N----
venue freq
0 Park 0.17
1 Thai Restaurant 0.08
2 Restaurant 0.08
3 Ice Cream Shop 0.08
4 Café 0.08
----M1P----
venue freq
0 Coffee Shop 0.13
1 Indian Restaurant 0.07
2 Chinese Restaurant 0.07
3 Pharmacy 0.07
4 Restaurant 0.07
----M1R----
venue freq
0 Pizza Place 0.13
1 Middle Eastern Restaurant 0.13
2 Burger Joint 0.07
3 Korean Restaurant 0.03
4 Fish Market 0.03
----M1S----
venue freq
0 Chinese Restaurant 0.13
1 Caribbean Restaurant 0.07
2 Bakery 0.07
3 Breakfast Spot 0.03
4 Shopping Mall 0.03
----M1T----
venue freq
0 Fast Food Restaurant 0.13
1 Sandwich Place 0.07
2 Pharmacy 0.07
3 Bank 0.07
4 Falafel Restaurant 0.07
----M1V----
venue freq
0 Chinese Restaurant 0.13
1 Dessert Shop 0.10
2 Bubble Tea Shop 0.10
3 Bakery 0.07
4 Coffee Shop 0.07
----M1W----
venue freq
0 Chinese Restaurant 0.13
1 Coffee Shop 0.10
2 Fast Food Restaurant 0.10
3 Bakery 0.07
4 Pizza Place 0.07
----M1X----
venue freq
0 Donut Shop 1.0
1 Zoo Exhibit 0.0
2 Music Store 0.0
3 Office 0.0
4 Noodle House 0.0
----M2H----
venue freq
0 Park 0.17
1 Pharmacy 0.10
2 Bank 0.07
3 Sandwich Place 0.07
4 Coffee Shop 0.07
----M2J----
venue freq
0 Coffee Shop 0.10
1 Bank 0.07
2 Clothing Store 0.07
3 Bakery 0.07
4 Juice Bar 0.07
----M2K----
venue freq
0 Bank 0.14
1 Gas Station 0.14
2 Park 0.07
3 Café 0.07
4 Chinese Restaurant 0.07
----M2L----
venue freq
0 Park 0.14
1 Furniture / Home Store 0.09
2 Coffee Shop 0.09
3 Cafeteria 0.05
4 Baseball Field 0.05
----M2M----
venue freq
0 Korean Restaurant 0.23
1 Café 0.17
2 Coffee Shop 0.13
3 Bank 0.07
4 Hookah Bar 0.03
----M2N----
venue freq
0 Korean Restaurant 0.10
1 Pizza Place 0.07
2 Sushi Restaurant 0.07
3 Grocery Store 0.07
4 Café 0.07
----M2P----
venue freq
0 Coffee Shop 0.23
1 Grocery Store 0.07
2 Gas Station 0.07
3 Sandwich Place 0.07
4 Restaurant 0.07
----M2R----
venue freq
0 Coffee Shop 0.17
1 Pizza Place 0.10
2 Sandwich Place 0.07
3 Pharmacy 0.07
4 Grocery Store 0.07
----M3A----
venue freq
0 Park 0.10
1 Skating Rink 0.07
2 Bus Stop 0.07
3 Pharmacy 0.07
4 Coffee Shop 0.07
----M3B----
venue freq
0 Japanese Restaurant 0.10
1 Coffee Shop 0.07
2 Italian Restaurant 0.07
3 Middle Eastern Restaurant 0.07
4 Supermarket 0.07
----M3C----
venue freq
0 Coffee Shop 0.07
1 Middle Eastern Restaurant 0.07
2 History Museum 0.03
3 Bike Shop 0.03
4 Sporting Goods Shop 0.03
----M3H----
venue freq
0 Park 0.07
1 Coffee Shop 0.07
2 Bank 0.07
3 Ice Cream Shop 0.03
4 Ski Chalet 0.03
----M3J----
venue freq
0 Coffee Shop 0.27
1 Pizza Place 0.10
2 Furniture / Home Store 0.07
3 Gas Station 0.07
4 Fast Food Restaurant 0.07
----M3K----
venue freq
0 Coffee Shop 0.10
1 Sandwich Place 0.07
2 Turkish Restaurant 0.07
3 Italian Restaurant 0.07
4 Racetrack 0.03
----M3L----
venue freq
0 Park 0.50
1 Bank 0.25
2 Pizza Place 0.25
3 Zoo Exhibit 0.00
4 Music Store 0.00
----M3M----
venue freq
0 Gas Station 0.12
1 Vietnamese Restaurant 0.12
2 Pharmacy 0.08
3 Park 0.08
4 Coffee Shop 0.08
----M3N----
venue freq
0 Hotel 0.10
1 Pizza Place 0.07
2 Fast Food Restaurant 0.07
3 Pharmacy 0.07
4 Gas Station 0.07
----M4A----
venue freq
0 Coffee Shop 0.10
1 Park 0.10
2 Grocery Store 0.07
3 Portuguese Restaurant 0.03
4 Office 0.03
----M4B----
venue freq
0 Pizza Place 0.07
1 Brewery 0.07
2 Gym / Fitness Center 0.07
3 Fast Food Restaurant 0.07
4 Coffee Shop 0.07
----M4C----
venue freq
0 Gastropub 0.10
1 Coffee Shop 0.10
2 Thai Restaurant 0.07
3 Pizza Place 0.07
4 Sandwich Place 0.07
----M4E----
venue freq
0 Beach 0.10
1 Pub 0.07
2 Coffee Shop 0.07
3 Breakfast Spot 0.07
4 BBQ Joint 0.07
----M4G----
venue freq
0 Bakery 0.10
1 Grocery Store 0.07
2 Coffee Shop 0.07
3 Sushi Restaurant 0.07
4 Brewery 0.03
----M4H----
venue freq
0 Indian Restaurant 0.07
1 Coffee Shop 0.07
2 Sandwich Place 0.07
3 Grocery Store 0.07
4 Burger Joint 0.07
----M4J----
venue freq
0 Pizza Place 0.07
1 American Restaurant 0.07
2 Bakery 0.07
3 Gastropub 0.07
4 Coffee Shop 0.07
----M4K----
venue freq
0 Greek Restaurant 0.13
1 Pub 0.07
2 Ice Cream Shop 0.07
3 Italian Restaurant 0.07
4 Bakery 0.07
----M4L----
venue freq
0 Brewery 0.10
1 Park 0.10
2 Indian Restaurant 0.07
3 Coffee Shop 0.07
4 Café 0.07
----M4M----
venue freq
0 Bakery 0.10
1 Italian Restaurant 0.07
2 Café 0.07
3 Coffee Shop 0.07
4 Fish Market 0.03
----M4N----
venue freq
0 Sushi Restaurant 0.10
1 Italian Restaurant 0.10
2 Coffee Shop 0.10
3 Bakery 0.07
4 Bank 0.07
----M4P----
venue freq
0 Italian Restaurant 0.13
1 Café 0.10
2 Coffee Shop 0.10
3 Yoga Studio 0.07
4 Bookstore 0.07
----M4R----
venue freq
0 Italian Restaurant 0.10
1 Yoga Studio 0.07
2 Bookstore 0.07
3 Coffee Shop 0.07
4 Café 0.03
----M4S----
venue freq
0 Italian Restaurant 0.13
1 Coffee Shop 0.10
2 Restaurant 0.07
3 Bookstore 0.07
4 Dessert Shop 0.07
----M4T----
venue freq
0 Italian Restaurant 0.17
1 Park 0.13
2 Sushi Restaurant 0.10
3 Spa 0.07
4 Grocery Store 0.07
----M4V----
venue freq
0 Italian Restaurant 0.10
1 Sushi Restaurant 0.10
2 Liquor Store 0.07
3 Café 0.07
4 Spa 0.07
----M4W----
venue freq
0 Park 0.13
1 Coffee Shop 0.10
2 Italian Restaurant 0.10
3 Spa 0.07
4 Juice Bar 0.03
----M4X----
venue freq
0 Park 0.10
1 Café 0.07
2 Japanese Restaurant 0.07
3 Gastropub 0.07
4 Dance Studio 0.03
----M4Y----
venue freq
0 Coffee Shop 0.10
1 Dance Studio 0.07
2 Bookstore 0.07
3 Men's Store 0.07
4 Japanese Restaurant 0.07
----M5A----
venue freq
0 Coffee Shop 0.20
1 Park 0.10
2 Bakery 0.10
3 Breakfast Spot 0.07
4 Pub 0.03
----M5B----
venue freq
0 Coffee Shop 0.07
1 Café 0.07
2 Theater 0.07
3 Clothing Store 0.03
4 Burger Joint 0.03
----M5C----
venue freq
0 Gastropub 0.10
1 Coffee Shop 0.10
2 Farmers Market 0.07
3 Café 0.07
4 Restaurant 0.03
----M5E----
venue freq
0 Coffee Shop 0.13
1 Park 0.07
2 Japanese Restaurant 0.07
3 Farmers Market 0.07
4 Bakery 0.03
----M5G----
venue freq
0 Coffee Shop 0.13
1 Park 0.10
2 Plaza 0.07
3 Theater 0.07
4 Yoga Studio 0.03
----M5H----
venue freq
0 Coffee Shop 0.10
1 Café 0.10
2 Theater 0.07
3 Gym 0.07
4 Fast Food Restaurant 0.03
----M5J----
venue freq
0 Hotel 0.07
1 Park 0.07
2 Plaza 0.07
3 History Museum 0.03
4 Café 0.03
----M5K----
venue freq
0 Café 0.13
1 Hotel 0.10
2 Coffee Shop 0.07
3 Restaurant 0.07
4 American Restaurant 0.07
----M5L----
venue freq
0 Café 0.10
1 American Restaurant 0.07
2 Seafood Restaurant 0.07
3 Restaurant 0.07
4 Vegetarian / Vegan Restaurant 0.07
----M5M----
venue freq
0 Coffee Shop 0.07
1 Fast Food Restaurant 0.07
2 Bakery 0.07
3 Italian Restaurant 0.07
4 Restaurant 0.03
----M5N----
venue freq
0 Coffee Shop 0.13
1 Skating Rink 0.07
2 Bank 0.07
3 Japanese Restaurant 0.07
4 Italian Restaurant 0.07
----M5P----
venue freq
0 Italian Restaurant 0.13
1 Café 0.10
2 Middle Eastern Restaurant 0.07
3 Gastropub 0.07
4 Japanese Restaurant 0.07
----M5R----
venue freq
0 Italian Restaurant 0.07
1 Vegetarian / Vegan Restaurant 0.07
2 Coffee Shop 0.07
3 Grocery Store 0.07
4 Café 0.07
----M5S----
venue freq
0 Bakery 0.07
1 Park 0.07
2 Vegetarian / Vegan Restaurant 0.07
3 Café 0.07
4 Food Truck 0.03
----M5T----
venue freq
0 Café 0.13
1 Burger Joint 0.07
2 Vegetarian / Vegan Restaurant 0.07
3 Yoga Studio 0.03
4 Farmers Market 0.03
----M5V----
venue freq
0 Park 0.20
1 Coffee Shop 0.13
2 Café 0.10
3 Harbor / Marina 0.07
4 Gym 0.07
----M5X----
venue freq
0 Café 0.10
1 Plaza 0.07
2 Gym 0.07
3 Pub 0.03
4 Basketball Stadium 0.03
----M6A----
venue freq
0 Restaurant 0.10
1 Furniture / Home Store 0.07
2 Fried Chicken Joint 0.07
3 Clothing Store 0.07
4 Athletics & Sports 0.03
----M6B----
venue freq
0 Fast Food Restaurant 0.10
1 Grocery Store 0.10
2 Coffee Shop 0.10
3 Gas Station 0.07
4 Sandwich Place 0.07
----M6C----
venue freq
0 Pizza Place 0.10
1 Coffee Shop 0.07
2 Caribbean Restaurant 0.07
3 Bank 0.07
4 Italian Restaurant 0.07
----M6E----
venue freq
0 Furniture / Home Store 0.07
1 Pizza Place 0.07
2 Pharmacy 0.07
3 Park 0.07
4 Bank 0.03
----M6G----
venue freq
0 Café 0.13
1 Cocktail Bar 0.07
2 Korean Restaurant 0.07
3 Coffee Shop 0.07
4 Grocery Store 0.07
----M6H----
venue freq
0 Café 0.30
1 Italian Restaurant 0.10
2 Bakery 0.10
3 Coffee Shop 0.10
4 Bar 0.07
----M6J----
venue freq
0 Bar 0.10
1 Vietnamese Restaurant 0.07
2 Asian Restaurant 0.07
3 Park 0.03
4 Beer Store 0.03
----M6K----
venue freq
0 Bakery 0.10
1 Furniture / Home Store 0.07
2 Coffee Shop 0.07
3 Restaurant 0.07
4 Bar 0.07
----M6L----
venue freq
0 Coffee Shop 0.14
1 Supermarket 0.10
2 Pizza Place 0.10
3 Vietnamese Restaurant 0.10
4 Gas Station 0.05
----M6M----
venue freq
0 Coffee Shop 0.13
1 Convenience Store 0.07
2 Fast Food Restaurant 0.07
3 Furniture / Home Store 0.07
4 Gas Station 0.07
----M6N----
venue freq
0 Brewery 0.10
1 Coffee Shop 0.10
2 Pizza Place 0.07
3 Liquor Store 0.07
4 Beer Store 0.07
----M6P----
venue freq
0 Café 0.13
1 Bar 0.13
2 Flea Market 0.07
3 Italian Restaurant 0.07
4 Gastropub 0.07
----M6R----
venue freq
0 Coffee Shop 0.07
1 Gift Shop 0.07
2 Eastern European Restaurant 0.07
3 Bar 0.07
4 Park 0.07
----M6S----
venue freq
0 Café 0.10
1 Italian Restaurant 0.10
2 Bakery 0.07
3 Coffee Shop 0.07
4 Sushi Restaurant 0.07
----M8V----
venue freq
0 Park 0.21
1 Bakery 0.03
2 Fast Food Restaurant 0.03
3 Supermarket 0.03
4 Mexican Restaurant 0.03
----M8W----
venue freq
0 Coffee Shop 0.13
1 Pizza Place 0.07
2 Toy / Game Store 0.07
3 Pharmacy 0.07
4 Bank 0.07
----M8X----
venue freq
0 Coffee Shop 0.10
1 Burger Joint 0.07
2 French Restaurant 0.07
3 Italian Restaurant 0.07
4 Dessert Shop 0.07
----M8Y----
venue freq
0 Park 0.15
1 Italian Restaurant 0.12
2 Coffee Shop 0.12
3 Sushi Restaurant 0.08
4 Ice Cream Shop 0.04
----M8Z----
venue freq
0 Restaurant 0.10
1 Yoga Studio 0.07
2 Coffee Shop 0.07
3 Bank 0.07
4 Sushi Restaurant 0.07
----M9A----
venue freq
0 Pharmacy 0.14
1 Park 0.09
2 Grocery Store 0.09
3 Bank 0.09
4 Japanese Restaurant 0.05
----M9B----
venue freq
0 Park 0.16
1 Pizza Place 0.08
2 Intersection 0.08
3 Hotel 0.04
4 Clothing Store 0.04
----M9C----
venue freq
0 Convenience Store 0.10
1 Pizza Place 0.07
2 Pet Store 0.07
3 Café 0.07
4 Coffee Shop 0.07
----M9L----
venue freq
0 Electronics Store 0.15
1 Park 0.15
2 Sports Bar 0.05
3 Arts & Crafts Store 0.05
4 Food Court 0.05
----M9M----
venue freq
0 Golf Course 0.2
1 Coffee Shop 0.2
2 Convenience Store 0.1
3 Gas Station 0.1
4 Discount Store 0.1
----M9N----
venue freq
0 Gas Station 0.10
1 Grocery Store 0.07
2 Furniture / Home Store 0.07
3 Train Station 0.07
4 Pizza Place 0.03
----M9P----
venue freq
0 Pizza Place 0.07
1 Grocery Store 0.07
2 Restaurant 0.07
3 Coffee Shop 0.07
4 Gas Station 0.07
----M9R----
venue freq
0 Pizza Place 0.13
1 Pharmacy 0.10
2 Coffee Shop 0.10
3 Shopping Mall 0.07
4 Beer Store 0.07
----M9V----
venue freq
0 Pizza Place 0.15
1 Grocery Store 0.15
2 Coffee Shop 0.10
3 Sandwich Place 0.05
4 Café 0.05
----M9W----
venue freq
0 Coffee Shop 0.29
1 Casino 0.14
2 Drugstore 0.14
3 Mediterranean Restaurant 0.14
4 Gas Station 0.14
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first element of ``row`` is skipped (it is not treated as a venue
    category); the rest are sorted in descending order and the index labels
    of the first ``num_top_venues`` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(position):
    """Return ``position`` with its English ordinal suffix, e.g. 1 -> '1st'."""
    # 11th, 12th and 13th are irregular; all others follow their last digit
    if 11 <= position % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)


# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(position))
    for position in range(1, num_top_venues + 1)
]

# create a new dataframe with one row per postal code
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill every row with that postal code's most common venue categories
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()
| Neighborhood | 1st Most Common Venue | 2nd Most Common Venue | 3rd Most Common Venue | 4th Most Common Venue | 5th Most Common Venue | 6th Most Common Venue | 7th Most Common Venue | 8th Most Common Venue | 9th Most Common Venue | 10th Most Common Venue | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M1B | Zoo Exhibit | Trail | Restaurant | Fast Food Restaurant | Supermarket | Spa | Bus Station | Caribbean Restaurant | Chinese Restaurant | Pizza Place |
| 1 | M1C | Park | Playground | Breakfast Spot | Gym / Fitness Center | Italian Restaurant | Hotel | Burger Joint | Dumpling Restaurant | Dog Run | Doner Restaurant |
| 2 | M1E | Pizza Place | Park | Bank | Fast Food Restaurant | Liquor Store | Food & Drink Shop | Supermarket | Sports Bar | Laundromat | Beer Store |
| 3 | M1G | Pizza Place | Coffee Shop | Park | Fast Food Restaurant | Music Store | Discount Store | Sandwich Place | Supermarket | Department Store | Indian Restaurant |
| 4 | M1H | Indian Restaurant | Sandwich Place | Restaurant | Bank | Coffee Shop | Gas Station | Yoga Studio | Thai Restaurant | Fish & Chips Shop | Music Store |
After testing several numbers of clusters it was found that 10 clusters gave a good distribution
# set number of clusters
kclusters = 10

# Clustering uses only the category frequencies, not the postal-code key.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned because its default differs
# between scikit-learn releases
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
array([0, 9, 0, 6, 0, 0, 6, 4, 0, 8], dtype=int32)
# add clustering labels as the first column of the top-venues table
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and top venues onto each postal-code row of
# df_scrape, which carries the borough, neighborhood and coordinates
venues_by_code = neighborhoods_venues_sorted.set_index('Neighborhood')
toronto_merged = df_scrape.join(venues_by_code, on='PostalCode')

toronto_merged.head() # check the last columns!
| PostalCode | Borough | Neighborhood | Latitude | Longitude | Cluster Labels | 1st Most Common Venue | 2nd Most Common Venue | 3rd Most Common Venue | 4th Most Common Venue | 5th Most Common Venue | 6th Most Common Venue | 7th Most Common Venue | 8th Most Common Venue | 9th Most Common Venue | 10th Most Common Venue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M3A | North York | Parkwoods | 43.753259 | -79.329656 | 8 | Park | Skating Rink | Pharmacy | Coffee Shop | Bus Stop | Chinese Restaurant | Laundry Service | Bank | Supermarket | Food & Drink Shop |
| 1 | M4A | North York | Victoria Village | 43.725882 | -79.315572 | 8 | Coffee Shop | Park | Grocery Store | Optical Shop | Shoe Store | Sandwich Place | Portuguese Restaurant | Gym | Pharmacy | Cosmetics Shop |
| 2 | M5A | Downtown Toronto | Regent Park,Harbourfront | 43.654260 | -79.360636 | 4 | Coffee Shop | Park | Bakery | Breakfast Spot | Historic Site | Thai Restaurant | Mediterranean Restaurant | Performing Arts Venue | Farmers Market | Spa |
| 3 | M6A | North York | Lawrence Manor,Lawrence Heights | 43.718518 | -79.464763 | 0 | Restaurant | Furniture / Home Store | Clothing Store | Fried Chicken Joint | Department Store | Electronics Store | Men's Store | Mediterranean Restaurant | Bowling Alley | Boutique |
| 4 | M9A | Etobicoke | Islington Avenue | 43.667856 | -79.532242 | 8 | Pharmacy | Bank | Grocery Store | Park | Ice Cream Shop | Spa | Bakery | Camera Store | Café | Japanese Restaurant |
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11.4)

# set color scheme for the clusters: one evenly spaced rainbow color each
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# add markers to the map
markers_colors = []
cluster_rows = zip(
    toronto_merged['Latitude'],
    toronto_merged['Longitude'],
    toronto_merged['Neighborhood'],
    toronto_merged['Cluster Labels'],
)
for lat, lon, poi, cluster in cluster_rows:
    label = folium.Popup(f'{poi} Cluster {cluster}', parse_html=True)
    # cluster 0 maps to index -1, i.e. the last color in the list
    cluster_color = rainbow[cluster - 1]
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    )
    marker.add_to(map_clusters)

map_clusters